from __future__ import print_function
import os.path
import pickle
import pandas as pd
import sys
sys.path.insert(0, '../../')
import numpy as np
import itertools
#import Datanalytics as da
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from taigapy import TaigaClient
import dalmatian as dm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from JKBio.helper import pyDESeq2
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
import gseapy
from sklearn.preprocessing import scale
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from bokeh.plotting import *
from bokeh.models import HoverTool
import seaborn as sns
# --- Notebook setup ---------------------------------------------------------
# IPython magics: auto-reload local modules on edit and enable the rpy2
# bridge used later for the erccdashboard (R) analysis.
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
# Render bokeh plots inline.
output_notebook()
# --- Stage raw fastqs in the project bucket ---------------------------------
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
! gsutil -m cp -r gs://transfer-amlproject/RNPv3 gs://amlproject/RNA/
! gsutil ls gs://amlproject/
# --- Register samples on Terra and run the RNA-seq pipeline -----------------
sampleset='RNPv3'
# Registers the paired fastqs (R1/R2) found in the bucket folder as a Terra
# sample set; '_MP7624' is the token used to split out sample names.
terra.uploadFromFolder('amlproject','RNPv3/',
'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
fformat="fastqR1R2", sep='_MP7624')
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
# STAR alignment -> RSEM quantification -> aggregation; each submission is
# awaited (waitForSubmission polls Terra until the jobs finish).
submission_id = wm.create_submission("star_v1-0_BETA_cfg", sampleset, 'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg",
sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg",
sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
# --- Retrieve the aggregated expected-count matrix --------------------------
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']  # gs:// path
project = "RNPv2"
version = "scaling_2"
results
mkdir ../../data/$project
! gsutil cp $rsem_genes_expected_count ../../data/$project/
file = '../../data/'+project+'/'+rsem_genes_expected_count.split('/')[-1]
file
! gunzip $file
file
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
# NOTE(review): this re-read uses '../data/' while the download above used
# '../../data/' — presumably the notebook was later rerun from a different
# working directory; confirm which path is current.
rsem_genes_expected_count = pd.read_csv("../data/"+project+"/RNPv3.rsem_genes_expected_count.txt", sep='\t')
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
# Convert gene ids to symbols; convertGenes returns a pair and [0] takes the
# converted series.  NOTE(review): semantics of h.convertGenes inferred from
# usage — confirm.
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
data=data.set_index('gene_id')
data
# Sequencing columns arrive numbered "1".."73"; map them to descriptive
# sample names of the form <sample id>-<cell line>-RNP_<target>-<replicate>.
# The list is ordered, so the mapping key is simply the 1-based position.
sample_names = [
    "mr120-MV411-RNP_IRF2BP2-r4",
    "mr121-MV411-RNP_IRF2BP2-r5",
    "mr122-MV411-RNP_IRF2BP2-r6",
    "mr123-MV411-RNP_IRF8-r4",
    "mr124-MV411-RNP_IRF8-r5",
    "mr125-MV411-RNP_IRF8-r6",
    "mr126-MV411-RNP_MEF2D-r4",
    "mr127-MV411-RNP_MEF2D-r5",
    "mr128-MV411-RNP_MEF2D-r6",
    "mr129-MV411-RNP_MYC-r4",
    "mr130-MV411-RNP_MYC-r5",
    "mr131-MV411-RNP_MYC-r6",
    "mr132-MV411-RNP_RUNX1-r4",
    "mr133-MV411-RNP_RUNX1-r5",
    "mr134-MV411-RNP_RUNX1-r6",
    "mr135-MV411-RNP_RUNX2-r4",
    "mr136-MV411-RNP_RUNX2-r5",
    "mr137-MV411-RNP_RUNX2-r6",
    "mr138-MV411-RNP_SPI1-r4",
    "mr139-MV411-RNP_SPI1-r5",
    "mr140-MV411-RNP_SPI1-r6",
    "mr141-MV411-RNP_ZMYND8-r4",
    "mr142-MV411-RNP_ZMYND8-r5",
    "mr143-MV411-RNP_ZMYND8-r6",
    "mr144-MV411-RNP_LMO2-r4",
    "mr145-MV411-RNP_LMO2-r5",
    "mr146-MV411-RNP_LMO2-r6",
    "mr147-MV411-RNP_LYL1-r4",
    "mr148-MV411-RNP_LYL1-r5",
    "mr149-MV411-RNP_LYL1-r6",
    "mr150-MV411-RNP_MAX-r4",
    "mr151-MV411-RNP_MAX-r5",
    "mr152-MV411-RNP_MAX-r6",
    "mr153-MV411-RNP_ZEB2-r4",
    "mr154-MV411-RNP_ZEB2-r5",
    "mr155-MV411-RNP_ZEB2-r6",
    "mr156-MV411-RNP_MEF2C-r4",
    "mr157-MV411-RNP_MEF2C-r5",
    "mr158-MV411-RNP_MEF2C-r6",
    "mr159-MV411-RNP_MEIS1-r4",
    "mr160-MV411-RNP_MEIS1-r5",
    "mr161-MV411-RNP_MEIS1-r6",
    "mr162-MV411-RNP_FLI1-r4",
    "mr163-MV411-RNP_FLI1-r5",
    "mr164-MV411-RNP_FLI1-r6",
    "mr165-MV411-RNP_ELF2-r4",
    "mr166-MV411-RNP_ELF2-r5",
    "mr167-MV411-RNP_ELF2-r6",
    "mr168-MV411-RNP_GFI1-r4",
    "mr169-MV411-RNP_GFI1-r5",
    "mr170-MV411-RNP_GFI1-r6",
    "mr171-MV411-RNP_IKZF1-r4",
    "mr172-MV411-RNP_IKZF1-r5",
    "mr173-MV411-RNP_IKZF1-r6",
    "mr174-MV411-RNP_CEBPA-r4",
    "mr175-MV411-RNP_CEBPA-r5",
    "mr176-MV411-RNP_CEBPA-r6",
    "mr177-MV411-RNP_MYB-r4",
    "mr178-MV411-RNP_MYB-r5",
    "mr179-MV411-RNP_MYB-r6",
    "mr180-MV411-RNP_MYBL2-r1",
    "mr181-MV411-RNP_MYBL2-r2",
    "mr182-MV411-RNP_MYBL2-r3",
    "mr183-MV411-RNP_HOXA9-r4",
    "mr184-MV411-RNP_HOXA9-r5",
    "mr185-MV411-RNP_HOXA9-r6",
    "mr186-MV411-RNP_AAVS1-r1",
    "mr187-MV411-RNP_AAVS1-r2",
    "mr188-MV411-RNP_AAVS1-r3",
    "mr189-MV411-RNP_SPI1-r7",
    "mr190-MV411-RNP_SP1-r4",
    "mr191-MV411-RNP_SP1-r5",
    "mr192-MV411-RNP_SP1-r6",
]
rename = {str(num): name for num, name in enumerate(sample_names, start=1)}
data.columns
# Replace the numeric sequencing column names ("1".."73") with the
# descriptive sample names defined above.
data.columns = [rename[i] for i in data.columns]
data
# filter some more: drop genes with zero variance across samples
# Row indices of genes with zero variance across all samples (uninformative).
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
data = data.drop(data.iloc[toremove.ravel()].index,0)
data.shape
# Inspect the ERCC spike-in control rows.
data[data.index.str.contains('ERCC')]
# After gene-id conversion, rows still named 'ENSG00…' are the unconverted
# leftovers.  Keep everything else (gene symbols + ERCC spike-ins) in both
# `ERCC` and `data`, and drop the unconverted ids.
# NOTE(review): `ERCC` and `data` end up identical here; downstream code
# always subsets `ERCC` with str.contains('ERCC-') — confirm intent.
ERCC = data[~data.index.str.contains('ENSG00')]
ensg = data[data.index.str.contains('ENSG00')]
data = data[~data.index.str.contains('ENSG00')]
# renormalize the data
len(ERCC)
ERCC
# Core transcription factor (CTF) list, used later for highlighting volcano
# plots and building the CTF fold-change matrix.
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
# Sort sample columns alphabetically for a stable plotting order.
a = data.columns.tolist()
a.sort()
data =data[a]
%matplotlib inline
ig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(data.corr(),
xticklabels=data.columns,
yticklabels=data.columns, ax=ax)
plt.savefig('../results/'+project+'/plots/correlation_'+version+'.pdf')
data.to_csv('../results/'+project+'/'+version+'_counts.csv')
data = pd.read_csv('../results/'+project+'/'+version+'_counts.csv',index_col=0)
%matplotlib inline
# Clustered correlation heatmap over all samples.
sns.clustermap(data.corr(), figsize=(20, 20))
plt.savefig('../results/'+project+'/plots/cluster_corr_count_'+version+'.pdf')
data.sum().tolist()
data.shape
# Drop sample mr138 (SPI1 r4) from both the counts and the ERCC table.
# NOTE(review): presumably an outlier in the clustermap above — confirm.
data= data.drop(columns=['mr138-MV411-RNP_SPI1-r4'])
ERCC = ERCC.drop(columns=['mr138-MV411-RNP_SPI1-r4'])
%matplotlib inline
# Re-plot the clustermap after the removal.
sns.clustermap(data.corr(), figsize=(20, 20))
plt.savefig('../results/'+project+'/plots/cluster_corr_count_gene_removed_'+version+'.pdf')
# R cell: load the erccdashboard package (spike-in based scaling analysis).
%%R
library('erccdashboard')
# erccdashboard expects integer counts plus a 'Feature' id column.
ERCC = ERCC.astype(int)
ERCC['Feature'] = ERCC.index
ERCC
# Quick look: log2 ratio of each spike-in to its mean across a handful of
# control / representative samples.
sns.heatmap(np.log2(ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SPI1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values / ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SPI1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values.mean(0)+1))
# Targets = third '-'-separated field of each sample name (e.g. RNP_MYC).
# NOTE(review): [:-1] drops the last column — at this point all columns are
# samples (gene_id is only appended later), so one SP1 replicate is skipped
# when collecting target names; harmless here but confirm.
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
experiments.remove("RNP_AAVS1")
#TODO: compute the mass from concentration
###################################################
### code chunk number 3: defineInputData
###################################################
# Push erccdashboard input parameters into the R session.
%R datType = "count" # "count" for RNA-Seq data, "array" for microarray data
%R isNorm = F # flag to indicate if input expression measures are already normalized, default is FALSE
%R -i project,version filenameRoot = paste("../",project,"/ERCCdashboard_",version, sep = "", collapse = NULL) # user defined filename prefix for results files
%R sample2Name = "AAVS1" # name for sample 2 in the experiment
%R erccmix = "Single" # name of ERCC mixture design, "RatioPair" is default
%R erccdilution = 1/100 # dilution factor used for Ambion spike-in mixtures
%R spikeVol = 1 # volume (in microliters) of diluted spike-in mixture added to total RNA mass
%R choseFDR = 0.1 # user defined false discovery rate (FDR), default is 0.05
cols = list(ERCC.columns)
cols.sort()
res={}
for val in experiments:
d = {}
e=0
d.update({
'Feature':'Feature'
})
for i in cols[1:]:
if val+'-' in i:
e+=1
d.update({i: val.split('_')[-1]+'_'+str(e)})
d.update({
'mr186-MV411-RNP_AAVS1-r1': 'AAVS1_1',
'mr187-MV411-RNP_AAVS1-r2': 'AAVS1_2',
})
if len(d)==6:
d.update({
'mr188-MV411-RNP_AAVS1-r3': 'AAVS1_3'
})
a = ERCC[list(d.keys())].rename(columns=d)
a.to_csv('../data/ERCC_estimation.csv', index=None)
val = val.split('_')[-1]
torm = '../results/'+project+'/ercc_'+version+'_'+val+'.AAVS1.All.Pvals.csv'
! rm $torm
%R -i val print(val)
%R print(sample2Name)
%R a <- read.csv('../data/ERCC_estimation.csv')
%R print(head(a))
%R exDat = ''
%R totalRNAmass <- 0.5
try:
%R -i val exDat = initDat(datType = datType, isNorm = isNorm, exTable = a, filenameRoot = filenameRoot, sample1Name = val, sample2Name = sample2Name, erccmix = erccmix, erccdilution = erccdilution, spikeVol = spikeVol, totalRNAmass = totalRNAmass, choseFDR = choseFDR)
%R exDat = est_r_m(exDat)
%R exDat = dynRangePlot(exDat)
except Warning:
print("failed for "+val)
continue
except:
print('worked for '+val)
%R print(summary(exDat))
%R grid.arrange(exDat$Figures$dynRangePlot)
%R grid.arrange(exDat$Figures$r_mPlot)
%R grid.arrange(exDat$Figures$rangeResidPlot)
%R -o rm rm <- exDat$Results$r_m.res$r_m.mn
%R -o se se <- exDat$Results$r_m.res$r_m.mnse
res[val] = (rm[0],se[0])
# Bar plot of per-target ERCC scaling factors with their standard errors.
df= pd.DataFrame(data=res.values(),index=res.keys(), columns=['scaling','var'])
df['RNPs']=df.index
sns.barplot("RNPs","scaling",data=df,ci=None,)
plt.errorbar(x=range(0,len(df)),y=df['scaling'],
yerr=df['var'], fmt='none', c= 'r')
plt.xticks(rotation=60,ha='right')
plt.savefig('../results/'+project+"/plots/"+version+"_scaling_fact_with_conf.pdf")
# NOTE(review): `scaling` is only assigned a few cells below (scaling = res,
# or loaded from json) — this cell presumably ran after those in the
# notebook; confirm execution order.
scaling
# Print targets whose scaling factor is significant (|mean| exceeds s.e.).
# (fix: restored the loop indentation stripped by the notebook export.)
for i, v in scaling.items():
    if abs(v[0]) > v[1]:
        print(i, v[0])
# Sanity check: mean spike-in counts for control (AAVS1) vs SPI1 samples.
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'AAVS1' in i]].mean()
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'SPI1' in i]].mean()
# Persist the scaling factors so the notebook can resume without re-running
# the R dashboard, then reload to verify the round trip.
scaling = res
h.dictToFile(scaling,"../results/"+project+"/"+version+"_scaling.json")
scaling = h.fileToDict("../results/"+project+"/"+version+"_scaling.json")
scaling
# Differential expression (DESeq2 via pyDESeq2) of each target vs AAVS1.
data['gene_id'] = data.index
housekeeping = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf
#results = {}
# NOTE(review): `results` is expected to be a dict here (see the commented
# initializer) but was last bound to a Terra Series above — confirm which
# cell initialized it in the live notebook.
# (fix: restored the loop indentation stripped by the notebook export, and
# corrected the np.nan_to_num calls — see below.)
for val in experiments:
    print(val)
    # Two-column design: DMSO flags the AAVS1 controls, Target flags this
    # experiment's replicates.  data.columns[:-1] excludes 'gene_id'.
    design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'],
                          data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val+'-' in i else 0 for i in data.columns[:-1]]]).T)
    # R/DESeq2 sample names cannot contain '-'.
    design.index = design.index.astype(str).str.replace('-','.')
    deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design,
                              design_formula='~DMSO + Target', gene_column="gene_id")
    if abs(scaling[val.split('_')[1]][0]) > scaling[val.split('_')[1]][1]:
        # Significant global shift: anchor size factors on the ERCC spike-ins.
        print("estimating sizeFactors for this one")
        deseq.run_estimate_size_factors(controlGenes=data.gene_id.str.contains("ERCC-"))
    elif val in results:
        # Already computed and no rescaling needed: skip.
        continue
    deseq.run_deseq()
    deseq.get_deseq_result()
    r = deseq.deseq_result
    # Replace NaN p-values with 1 (not significant) and NaN fold-changes
    # with 0.  BUGFIX: np.nan_to_num's second positional argument is `copy`,
    # not the fill value, so the original filled NaN p-values with the
    # default 0.0 (i.e. maximally significant); pass nan= explicitly.
    r.pvalue = np.nan_to_num(np.array(r.pvalue), nan=1)
    r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), nan=0)
    results[val] = r
results
# Interactive volcano plot per experiment, highlighting the CTF genes.
# (fix: restored the loop indentation stripped by the notebook export.)
for val in experiments:
    a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, showlabels=True,folder="../results/"+project+"/plots/"+version+'_')
    try:
        show(a)
    except RuntimeError:
        # NOTE(review): bokeh's show() apparently fails transiently here and
        # is simply retried once — confirm this is still needed.
        show(a)
ls ../results/$project/plots/
!mkdir ../results/$project/deseq_$version
for k, val in results.items():
val.to_csv('../results/'+project+'/deseq_'+version+'/'+k+".csv")
results = {}
des = ! ls ../results/$project/deseq_$version/*.csv
for val in des:
results["RNP_"+val.split('RNP_')[-1].split('.')[0]] = pd.read_csv(val,index_col=0)
des
results.keys()
# Drop the aggregate file picked up by the glob, then assemble one wide
# table with fold-change / adjusted p / raw p columns per experiment.
# (fix: restored the loop indentation stripped by the notebook export.)
results.pop('RNP_all')
tosave = pd.DataFrame(index=results['RNP_CEBPA'].index)
for k,v in results.items():
    tosave[k+'_fc_log2'] = v.log2FoldChange
    tosave[k+'_padj'] = v.padj
    tosave[k+'_pval'] = v.pvalue
tosave.to_csv('../results/'+project+'/deseq_RNP_all_'+version+'.csv')
# CTF x experiment matrix of log2 fold-changes, zeroed when p >= 0.05.
# (fix: restored the loop indentation stripped by the notebook export.)
ctf.extend(['IRF2BP2','MYBL2','IKZF1'])
deseq = pd.DataFrame(index=ctf)
for k, val in results.items():
    deseq[k] = [i.log2FoldChange if i.pvalue<0.05 else 0 for a, i in val.loc[ctf].iterrows()]
deseq
fig = sns.clustermap(figsize=(25,20), cmap=plt.cm.RdYlBu, data=deseq,vmin=-1,vmax=1,xticklabels=deseq.columns, yticklabels=deseq.index)
fig.savefig('../results/'+project+'/plots/clustermap_ctf_deseq_all_scaled_'+version+'.pdf')
# Restrict to the square CTF-by-CTF matrix (column names reduced to the
# target symbol) and checkpoint it.
deseq.columns = [i.split('_')[1] for i in deseq.columns]
deseq = deseq.loc[deseq.columns]
deseq.to_csv('../results/'+project+'/deseq_CTFmat'+version+'.csv')
deseq = pd.read_csv('../results/'+project+'/deseq_CTFmat'+version+'.csv',index_col=0)
# Binary interaction graph: edge i->j when |log2FC| > 0.3; red = up, blue = down.
net = nx.from_pandas_adjacency(((deseq < -0.3) | (deseq > 0.3)).T,create_using=nx.DiGraph)
pos = nx.nx_agraph.graphviz_layout(net, prog="neato")
colors = ['red' if deseq.loc[i[1],i[0]]> 0 else 'blue' for i in net.edges]
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
# BUGFIX: savefig must run before show() — with the inline backend show()
# finalizes the figure, so the original savefig-after-show wrote a blank PDF.
plt.savefig("../results/"+project+"/plots/"+version+"_interaction_graph_binary.pdf")
plt.show()
# Weighted interaction graph: zero out weak effects (|log2FC| < 0.3) and
# color edges by the normalized signed fold-change.
deseq[(deseq > -0.3) & (deseq < 0.3)]=0
net = nx.from_pandas_adjacency(deseq.T,create_using=nx.DiGraph)
pos = nx.nx_agraph.graphviz_layout(net, prog='dot')
colors = [-deseq.loc[i[1],i[0]] for i in net.edges]
colors = [i/-min(colors) if i <0 else i/max(colors) for i in colors]
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
# BUGFIX: savefig must run before show() — with the inline backend show()
# finalizes the figure, so the original savefig-after-show wrote a blank PDF.
plt.savefig("../results/"+project+"/plots/"+version+"_interaction_graph_continuous.pdf")
plt.show()
# Dimensionality reduction over samples, colored by target gene.
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], title="PCA plot across replicates", radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]], folder= "../results/"+project+"/plots/"+version+"_")
# Reduce to 30 PCs first, then t-SNE down to 2 dims for display.
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)
h.scatter(red, labels=data.columns[:-1], title='TSNE plot across replicates', radi=10, colors=[col[i.split('-')[2]] for i in data.columns[:-1]], folder= "../results/"+project+"/plots/"+version+"_")
# observation: sample mr129-MYC-r4 looks like an outlier in the plots above
data
# Gene-set enrichment (GSEA) per experiment; results accumulated in `res`.
res = {}
experiments
res
# Run GSEA (WikiPathways 2013) for each target vs the AAVS1 controls,
# applying the ERCC-derived scaling when the global shift is strong.
# (fix: restored the loop/with indentation stripped by the notebook export.)
for val in experiments:
    print(val)
    # Counts for this experiment's replicates plus the AAVS1 controls.
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        # r_m is a log2 factor, hence the 2** when applying it.
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013',
                           cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
# Checkpoint the GSEA results.
with open('../data/'+project+'/'+version+'_wikipathway_RNPv2', 'wb') as f:
    pickle.dump(res,f)
# Reload the pickled GSEA results (resume point).
# (fix: restored the with/loop indentation stripped by the notebook export.)
with open('../data/'+project+'/'+version+'_wikipathway_RNPv2','rb') as f:
    res = pickle.load(f)
# Shorten pathway names (drop leading junk and the trailing 'WP…' id) and
# plot the top 25 enriched terms per experiment.
for val in experiments:
    res[val].res2d['Term'] = [i[3:].split('WP')[0] for i in res[val].res2d['Term']]
    a = sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                    hue_order="geneset_size")
    a.set_title(val)
    # NOTE(review): the same filename is rewritten each iteration, so only
    # the last experiment's plot survives on disk — confirm if intended.
    a.figure.savefig('../results/'+ project+ '/plots/'+version + '_wikipathway_mainTerms.pdf')
    plt.show()
# Build an experiments x pathway-terms matrix of enrichment scores.
# (fix: restored the loop indentation stripped by the notebook export.)
a = set()
for k, val in res.items():
    a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[i][n] = v.es
res = pd.DataFrame(a, index=res.keys())
# Shorten term names and reduce row labels to the target symbol.
res.columns = [i[3:].split('WP')[0] for i in res.columns]
res.index = [i.split('_')[1] for i in res.index]
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1,xticklabels=res.columns, yticklabels=res.index)
fig.savefig("../results/"+project+"/"+version+"enriched_terms_scaled_gsea.pdf")
res.to_csv('../results/'+project+'/'+version+'_wikipathway_gsea_matrix.csv')
# Same GSEA analysis, now against GO Biological Process 2015.
# (fix: restored the loop/with indentation stripped by the notebook export.)
res = {}
res.keys()
for i, val in enumerate(experiments):
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        # Apply the ERCC-derived log2 scaling factor.
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    elif val in res:
        # Already computed (on resume) and no rescaling needed: skip.
        continue
    res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015',
                           cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    plt.figure(i)
    a = sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                    hue_order="geneset_size")
    a.set_title(val)
    a.figure.savefig('../results/'+ project+ '/plots/'+version + '_gobioproc2015_mainTerms_'+val+'.pdf')
# Checkpoint the results.
with open('../results/'+project+'/'+version+'_GO_Biological_Process_2015_RNPv2', 'wb') as f:
    pickle.dump(res,f)
# Reload the pickled GO GSEA results.
# BUGFIX: the file must be opened for *reading* — the original used 'wb',
# which truncates the pickle just written and makes pickle.load fail.
# (Also restored the with/loop indentation stripped by the notebook export.)
with open('../results/'+project+'/'+version+'_GO_Biological_Process_2015_RNPv2', 'rb') as f:
    res = pickle.load(f)
# Strip the trailing "(GO:…)" id from every term name.
for i, v in res.items():
    res[i].res2d['Term'] = [i.split('(GO')[0] for i in v.res2d['Term']]
# creating matrices of enrichment scores (experiments x GO terms)
# Experiments x GO-term matrix of enrichment scores.
# (fix: restored the loop indentation stripped by the notebook export.)
a = set()
for k, val in res.items():
    a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1, yticklabels=res.index ,cmap=plt.cm.RdYlBu)
fig.savefig("../results/"+project+"/"+version+"enriched_terms_scaled_gsea_bioproc2015.pdf")
# Cluster experiments in gene-set space and visualize their similarity.
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(res)
sort = labels.argsort()
a = sns.clustermap(-res.T.corr(),cmap=plt.cm.RdYlBu,vmin=-1,vmax=1)
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_over_bioproc2015_geneset.pdf")
a = h.plotCorrelationMatrix(res.values[sort], res.index[sort].tolist(), interactive=True, title="correlation matrix over bioproc2015 gene sets", folder="../results/"+project+"/plots/"+version+"_")#,colors=[labels[i] for i in sort])
red = PCA(2).fit_transform(res)
h.scatter(red, labels=res.index, radi=1, colors=labels, title="PCA distance in geneset space", showlabels=True,folder="../results/"+project+"/plots/"+version+"_")
red = TSNE(2,2).fit_transform(res)
h.scatter(red, labels=res.index, radi=9, colors=labels, title="TSNE distance in geneset space",showlabels=True,folder="../results/"+project+"/plots/"+version+"_")
# Checkpoint the matrix.
res.to_csv('../results/'+project+'/'+version+'_biopathway_gsea.csv')
res = pd.read_csv('../results/'+project+'/'+version+'_biopathway_gsea.csv',index_col=0)
# Transcriptome-wide similarity of experiments (one log2FC vector each).
# NOTE: this rebinds `data` from the count matrix to a fold-change matrix.
# (fix: restored the loop indentation stripped by the notebook export.)
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    data[i]=v.log2FoldChange
# Agglomerative clustering over experiments (columns) to get labels + tree...
model = AgglomerativeClustering(n_clusters=8,linkage="average",
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.values.T)
ii = itertools.count(data.values.shape[1])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
# ...then DBSCAN over rows.  NOTE(review): `labels` is overwritten here and
# only referenced by the commented-out colors argument below — confirm
# whether the DBSCAN step is still wanted.
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(data.values)
a = sns.clustermap(data.corr())
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_correlation_transcriptome.pdf")
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation",folder= "../results/"+project+"/plots/"+version+"_")#,colors=[labels[i] for i in sort])
## Filtered version (set to 0 genes with low p_value)
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    # BUGFIX: the original used '==' (a no-op comparison) instead of '=',
    # so non-significant genes (p > 0.01) were never actually zeroed.
    # Note: this mutates the DataFrames stored in `results` in place.
    v.loc[v[v.pvalue>0.01].index,"log2FoldChange"] = 0
    data[i]=v.log2FoldChange
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation matrix over high pvalue genes",folder="../results/"+project+"/plots/"+version+"_")
# Clustermap over the most variable non-spike-in genes.
mostvar = data[(~data.index.str.contains('ERCC-')) & (data.var(1)>4)]
a = sns.clustermap(-mostvar.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_mostvariable_genes.pdf")
# Fold-changes of the CRC (core regulatory circuitry) members themselves.
# NOTE(review): 'CEBPa' is lower-case here while 'CEBPA' is used elsewhere;
# the `& set(data.index)` intersection silently drops any mismatch — confirm.
results_ctf = data.loc[set(['MYC',
'MYB',
'SPI1',
'RUNX1',
'IRF2BP2',
'FLI1',
'ELF2',
'ZEB2',
'GFI1',
'LMO2',
'CEBPa',
'MEF2D',
'MEF2C',
'IRF8',
'MEIS1',
'RUNX2',
'ZMYND8']) & set(data.index)].T
model = AgglomerativeClustering(n_clusters=7,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(results_ctf)
ii = itertools.count(results_ctf.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
h.plotCorrelationMatrix(results_ctf.values[sort], title="correlation matrix over similarity over CRC members aggregated with agglomerative clustering", folder="../results/"+project+'/plots/'+version+'_',names=results_ctf.index[sort].tolist(),interactive=True)
a = sns.clustermap(results_ctf.corr())
a.set_title("clustermap of correlation matrix over similarity over CRC members")
a.figure.savefig('../results/'+project+'/plots/'+version+"_clustermap_correlation_CRConly.pdf")
# NOTE(review): this runs before the pairplot below is assigned to `a`, so
# it re-saves the clustermap under the pairplot filename — out-of-order
# notebook cells; confirm intended execution order.
a.fig.savefig('../results/'+project+'/plots/'+version+"_pairplot_deseq_crconly.pdf")
a = sns.pairplot(results_ctf.T)
a.fig.suptitle("pairplot of differential expression of CRC members under RNP of these members", y=1.08)